1 Set Up

1.1 R Code

#packages we need for this code file
library(ggplot2)
library(mgcv)
library(lubridate)
library(zoo)
library(tidyverse)
library(dplyr)
library(DHARMa)
library(mgcViz)
library(extrafont)
library(arm)
loadfonts()
library(stargazer)
library(ellipse)
library(dotwhisker)
library(countreg)
#define functions we will need for analysis
#expit function
#Inverse-logit (expit): maps a real number (vectorized) into the (0, 1) interval.
#Computed as exp(x) / (1 + exp(x)).
expit <- function(x) {
  odds <- exp(x)
  odds / (1 + odds)
}

#logit function
#Logit: maps a probability in (0, 1) (vectorized) onto the real line.
#Inverse of expit(); computed as log(x / (1 - x)).
logit <- function(x) {
  odds <- x / (1 - x)
  log(odds)
}

1.2 Data

#read in data
#NOTE(review): path is relative to the working directory — run from the project root
main_analysis_data<-read.csv("./Data/full_data_set_11_29_21_unintentional.csv")

################################## set up data set ################################
#add the intervention dates and time period data
#convert the date columns from character to Date (assumes default "%Y-%m-%d" format — confirm against the CSV)
main_analysis_data$Intervention_First_Date<-as.Date(main_analysis_data$Intervention_First_Date)
main_analysis_data$Time_Period_Start<-as.Date(main_analysis_data$Time_Period_Start)
#rename "sum_deaths" to "imputed_deaths"; downstream code refers to imputed_deaths
names(main_analysis_data)[which(colnames(main_analysis_data) == "sum_deaths")] <- "imputed_deaths"

################################## set up the Regions ##############################
#set up the regions according to Census: https://www.census.gov/geographies/reference-maps/2010/geo/2010-census-regions-and-divisions-of-the-united-states.html
NE.name <- c("Connecticut","Maine","Massachusetts","New Hampshire",
             "Rhode Island","Vermont","New Jersey","New York",
             "Pennsylvania")

MW.name <- c("Indiana","Illinois","Michigan","Ohio","Wisconsin",
             "Iowa","Kansas","Minnesota","Missouri","Nebraska",
             "North Dakota","South Dakota")

S.name <- c("Delaware","District of Columbia","Florida","Georgia",
            "Maryland","North Carolina","South Carolina","Virginia",
            "West Virginia","Alabama","Kentucky","Mississippi",
            "Tennessee","Arkansas","Louisiana","Oklahoma","Texas")

W.name <- c("Arizona","Colorado","Idaho","New Mexico","Montana",
            "Utah","Nevada","Wyoming","Alaska","California",
            "Hawaii","Oregon","Washington")

region.list <- list(
  Northeast=NE.name,
  Midwest=MW.name,
  South=S.name,
  West=W.name)

#assign each row's Census region with vectorized %in% membership tests instead of
#the original per-state loop (identical result, one pass per region)
#default is "West": any state not found in the other three lists keeps it
main_analysis_data$Region <- "West"
main_analysis_data$Region[main_analysis_data$State %in% region.list$Northeast] <- "Northeast"
main_analysis_data$Region[main_analysis_data$State %in% region.list$Midwest] <- "Midwest"
main_analysis_data$Region[main_analysis_data$State %in% region.list$South] <- "South"

2 Sandwich Estimator Code

#here, we estimate the variance-covariance matrix via the sandwich estimator
#we wrap the computation in reusable functions so the code is not repeated for each model:


compute_sd_and_CI_linear_link <- function(Z, log_observed_y, theta, z_value = 1.96, d,
                              return_full_cov = FALSE){
  #Computes robust (sandwich) standard errors and Wald confidence intervals for a
  #model with a linear link on the log outcome.
  #Z: design matrix — rows are state-time combinations, columns are the policy
  #   measures; coerced to a matrix, so a numeric data.frame is also accepted
  #log_observed_y: log of the observed y from the data
  #theta: coefficient vector, in the same order as the columns of Z
  #z_value: critical value for the CI; defaults to 1.96 (95% CI)
  #d: number of estimated parameters, used for a small-sample bias correction
  #return_full_cov: TRUE/FALSE — whether to also return the full variance-covariance matrix
  
  #coerce once up front; the row-wise formulas below require matrix semantics
  Z <- as.matrix(Z)
  
  #residuals r_{s,t} = log(y_{s,t}) - Z_{s,t}^T * theta
  residuals <- as.numeric(log_observed_y - Z %*% theta)
  
  #middle term of the sandwich = sum_{s,t} Z_{s,t} Z_{s,t}^T r_{s,t}^2
  #computed in one vectorized step as (Z*r)^T (Z*r) instead of an explicit loop
  middle_term <- crossprod(Z * residuals)
  
  #C = sum_{s,t} Z_{s,t} Z_{s,t}^T = Z^T Z; invert with solve()
  C_inverse <- solve(crossprod(Z))
  
  #bias_term = N/(N - d) where N = nrow(Z)
  bias_term <- nrow(Z)/(nrow(Z) - d)
  
  #sandwich: C^{-1} M C^{-T} * bias (C_inverse is symmetric; t() kept for clarity)
  Sigma_hat <- C_inverse %*% middle_term %*% t(C_inverse) * bias_term
  
  #standard errors = square root of the diagonal of the variance-covariance matrix
  se_of_coefficients <- sqrt(diag(Sigma_hat))
  
  #Wald CI for each coefficient
  lb_coef <- theta - z_value*(se_of_coefficients)
  ub_coef <- theta + z_value*(se_of_coefficients)
  
  #return_data_set columns:
  #lb_coef: lower bounds of the coefficients
  #theta: coefficients
  #ub_coef: upper bounds of the coefficients
  #se_coef: standard error of coefficients
  return_data_set <- data.frame(lb_coef, theta, ub_coef, se_coef = se_of_coefficients)
  
  #if the caller wants the full variance-covariance matrix, return both
  if(return_full_cov){
    return(list(return_data_set = return_data_set, var_cov = Sigma_hat))
  }else{
    return(return_data_set)
  }
}

compute_sd_and_CI_logistic_link <- function(Z, observed_y, theta, z_value = 1.96, d,
                              return_full_cov = FALSE){
  #Computes robust (sandwich) standard errors and Wald confidence intervals for a
  #model with a logistic link function.
  #Z: design matrix — rows are state-time combinations, columns are the policy
  #   measures; coerced to a matrix, so a numeric data.frame is also accepted
  #observed_y: the observed y from the data (NOT logged — the original doc
  #            comment was copy-pasted from the linear-link version)
  #theta: coefficient vector, in the same order as the columns of Z
  #z_value: critical value for the CI; defaults to 1.96 (95% CI)
  #d: number of estimated parameters, used for a small-sample bias correction
  #return_full_cov: TRUE/FALSE — whether to also return the full variance-covariance matrix
  
  Z <- as.matrix(Z)
  
  #p_{s,t} = expit(Z_{s,t}^T theta), computed vectorized and inlined so this
  #function does not depend on the expit() helper defined elsewhere
  eta <- as.numeric(Z %*% theta)
  p_st <- exp(eta)/(1 + exp(eta))
  
  #middle term = sum_{s,t} Z_{s,t} Z_{s,t}^T (Y_{s,t} - p_{s,t})^2,
  #vectorized as (Z*e)^T (Z*e) with e = Y - p
  middle_term <- crossprod(Z * (observed_y - p_st))
  
  #C = sum_{s,t} Z_{s,t} Z_{s,t}^T p_{s,t}(1 - p_{s,t}),
  #vectorized as Z^T (Z * w) with weights w = p(1-p)
  C_term <- crossprod(Z, Z * (p_st*(1 - p_st)))

  C_inverse <- solve(C_term)
  
  #bias_term = N/(N - d) where N = nrow(Z)
  bias_term <- nrow(Z)/(nrow(Z) - d)
  
  Sigma_hat <- C_inverse %*% middle_term %*% t(C_inverse) * bias_term

  #standard errors = square root of the diagonal of the variance-covariance matrix
  se_of_coefficients <- sqrt(diag(Sigma_hat))

  #Wald CI for each coefficient
  lb_coef <- theta - z_value*(se_of_coefficients)
  ub_coef <- theta + z_value*(se_of_coefficients)
  
  #return_data_set columns:
  #lb_coef: lower bounds of the coefficients
  #theta: coefficients
  #ub_coef: upper bounds of the coefficients
  #exp_lb: exponential of lower bounds of the coefficients
  #exp_coef: exponential of theta
  #exp_ub: exponential of upper bounds of the coefficients
  #se_coef: standard error of coefficients
  return_data_set <- data.frame(lb_coef, 
                                theta, 
                                ub_coef,
                                exp_lb = exp(lb_coef), 
                                exp_coef = exp(theta),
                                exp_ub = exp(ub_coef), 
                                se_coef = se_of_coefficients)

  #if the caller wants the full variance-covariance matrix, return both
  if(return_full_cov){
    return(list(return_data_set = return_data_set, var_cov = Sigma_hat))
  }else{
    return(return_data_set)
  }
}

3 Attributable Deaths Computation

attr_death_compute <- function(data, coef_data,  tx_name = NULL, 
                               var_cov_mat_beta = NULL, prob_of_od_var = "prop_dead"){
  #Computes, per time period, the number of deaths attributable to the intervention:
  #(expected deaths had the intervention not occurred) - (observed deaths).
  #data: state-time data; must contain columns Time_Period_ID, Intervention_Redefined,
  #      population, imputed_deaths, the prob_of_od_var column, and the tx_name column(s)
  #coef_data: coefficient table whose row names include tx_name, with columns
  #           "estimate", "conf.low", "conf.high"
  #tx_name: name(s) of the treatment column(s); length 1 or 2
  #var_cov_mat_beta: variance-covariance matrix of the treatment coefficients;
  #                  required (delta method) when length(tx_name) == 2
  #prob_of_od_var: name of the column holding the observed probability of overdose death
  #Returns a data.frame with columns Time_Period, attr_deaths, attr_deaths_lb,
  #attr_deaths_ub; periods with no treated states are left as NA rows.
  
  #BUG FIX: the original passed the vector unique(data$Time_Period_ID) as nrow;
  #the intended row count is the NUMBER of distinct time periods
  attr_table <- data.frame(matrix(NA, nrow = length(unique(data$Time_Period_ID)), ncol = 4)) 
  
  #keep only state-time rows where treatment is in effect
  #base-R which() subsetting replaces dplyr::filter (same NA-dropping behavior),
  #so this function has no package dependency
  treated_data <- data[which(data$Intervention_Redefined > 0), , drop = FALSE]
  
  for(time in unique(treated_data$Time_Period_ID)){
    #restrict the treated data to time period `time`
    time_data <- treated_data[which(treated_data$Time_Period_ID == time), , drop = FALSE]
    
    #population of each treated state in this period
    pop <- time_data$population
    
    #estimated probability had the intervention not occurred:
    #observed probability rescaled by exp(-x^T beta) over the treatment covariates
    est_prob_no_int <- time_data[, prob_of_od_var]*
      exp(- as.matrix(time_data[, tx_name])%*%as.matrix(coef_data[tx_name, "estimate"]))
    
    #estimated number of OD deaths had the intervention not occurred
    n_od_no_int <- pop*est_prob_no_int
    
    if(length(tx_name) == 1){
      #single treatment parameter: plug the coefficient's CI bounds in directly —
      #no covariances to account for
      est_prob_no_int_lb <- time_data[, prob_of_od_var]*
        exp(- as.matrix(time_data[, tx_name])%*%as.matrix(coef_data[tx_name, "conf.low"]))
      n_attr_od_lb <- sum(pop*est_prob_no_int_lb) - sum(time_data$imputed_deaths)
      
      est_prob_no_int_ub <- time_data[, prob_of_od_var]*
        exp(- as.matrix(time_data[, tx_name])%*%as.matrix(coef_data[tx_name, "conf.high"]))
      n_attr_od_ub <- sum(pop*est_prob_no_int_ub) - sum(time_data$imputed_deaths)
    }else{
      #two treatment parameters (assumed): delta-method CI using the coefficient
      #covariance matrix; jacobian entries are d(total expected deaths)/d(beta_j)
      jacobian_first_entry <- sum(time_data$population*time_data[, prob_of_od_var]*
                                    exp(-as.matrix(time_data[, tx_name])%*%as.matrix(coef_data[tx_name, "estimate"]))*
                                    time_data[, tx_name[1]])
      jacobian_second_entry <- sum(time_data$population*time_data[, prob_of_od_var]*
                                     exp(-as.matrix(time_data[, tx_name])%*%as.matrix(coef_data[tx_name, "estimate"]))*
                                     time_data[, tx_name[2]])
      
      jacobian_mat <- matrix(c(jacobian_first_entry, jacobian_second_entry), nrow = 1)
      
      #sd of the attributable-death total via J %*% Sigma %*% J^T
      computed_sd <- sqrt(tcrossprod(jacobian_mat%*%var_cov_mat_beta, jacobian_mat)) 
      n_attr_od_lb <- sum(n_od_no_int - time_data$imputed_deaths) - 1.96*computed_sd
      n_attr_od_ub <- sum(n_od_no_int - time_data$imputed_deaths) + 1.96*computed_sd
    }
    
    #NOTE(review): row indexing by `time` assumes Time_Period_ID values are the
    #integers 1..K — confirm against the data-construction code
    attr_table[time, ] <- c(time, sum(n_od_no_int - time_data$imputed_deaths), 
                            n_attr_od_lb, 
                            n_attr_od_ub)
  }
  colnames(attr_table) <- c("Time_Period", "attr_deaths", "attr_deaths_lb", "attr_deaths_ub")
  attr_table
}

4 Event Study Data Creation

#observed death proportion per state-period: imputed deaths divided by population
main_analysis_data$prop_dead <- main_analysis_data$imputed_deaths/main_analysis_data$population

#create the dataset for the event study to check for pre-trend analysis

#First create dataset that indicates the intervention time of each state. 
#We then indicate the ID of the six-month time period that the intervention time is in so that we can use this to compare time points
time_data_int <- main_analysis_data %>%
  #group by the state
  group_by(State) %>%
  #find the time interval ID for the intervention time
  #we do this by finding the six-month time ID of the intervention date using floor_date(, "6 months")
  #NOTE(review): summarise() producing one row per state-period (then filtered below)
  #relies on multi-row summarise, which newer dplyr deprecates — consider mutate() + filter()
  summarise(intervention_time_id = ifelse(floor_date(Intervention_First_Date, "6 months") == Time_Period_Start, Time_Period_ID, NA)) %>%
  #filter out the other six-month time periods that aren't the intervention date
  filter(!is.na(intervention_time_id))

#merge the time_data_int with the main dataset by state
#all.x = TRUE keeps never-treated states; their intervention_time_id becomes NA
merged_main_time_data_int <- merge(main_analysis_data, time_data_int, by = "State", all.x = TRUE)

#create the columns that associate with the periods before the intervention
#we call these negative periods since it is negative relative to the intervention time
#neg_periods_df is a dataset of indicators of x periods before intervention, where rows are equal to number of periods before intervention 
#the maximum number of periods pre-intervention is equal to max(intervention_time_id) - 1
#NOTE(review): states with intervention_time_id == NA get NA in every indicator here;
#only the Hawaii case is patched below — confirm no other never-treated state remains NA
neg_periods_df <- sapply(1:(max(merged_main_time_data_int$intervention_time_id, na.rm = TRUE) - 1),
    #the indicator for x periods before intervention is equal to 1 if the time ID of intervention minus time ID is equal to x
    #otherwise, it is 0
                         function(x){ifelse(merged_main_time_data_int$intervention_time_id - 
                                              merged_main_time_data_int$Time_Period_ID == x, 1, 0)})

#create the column names for neg_periods_df of the form: "neg_x_pd"
colnames(neg_periods_df) <- sapply(1:(max(merged_main_time_data_int$intervention_time_id, na.rm = TRUE) - 1), 
                                   function(x){paste("neg_", x, "_pd", sep = "")})
#add in the state and time ID columns to neg_periods_df
#NOTE(review): cbind() with the character State column coerces the whole matrix to
#character; the indicator values are converted back to numeric at the end of this section
neg_periods_df <- cbind(neg_periods_df, "State" = merged_main_time_data_int$State, 
                        "Time_Period_ID" = merged_main_time_data_int$Time_Period_ID)
#for Hawaii, impute a 0 because it is NA right now
#NOTE(review): 1:34 hard-codes the indicator column count — confirm it equals
#max(intervention_time_id, na.rm = TRUE) - 1 for the current data
neg_periods_df[neg_periods_df[,"State"] == "Hawaii", 1:34] <- 0

#create the columns that associate with the periods after the intervention
#we call these positive periods since it is positive relative to the intervention time
#pos_periods_df is a dataset of indicators of x periods after intervention, where rows are equal to number of periods after intervention 
#the max number of periods after the intervention is determined by the maximum Time ID minus the earliest time period of the intervention
#the period 0 is associated with intervention time
pos_periods_df <- sapply(0:(max(merged_main_time_data_int$Time_Period_ID) - 
                              min(merged_main_time_data_int$intervention_time_id, na.rm = TRUE)),
                         function(x){ifelse(merged_main_time_data_int$Time_Period_ID - 
                                              merged_main_time_data_int$intervention_time_id == x, 1, 0)})
#create the column names for pos_periods_df of the form: "pos_x_pd"
colnames(pos_periods_df) <- sapply(0:(max(merged_main_time_data_int$Time_Period_ID) - 
                                        min(merged_main_time_data_int$intervention_time_id, na.rm = TRUE)), 
                                   function(x){paste("pos_", x, "_pd", sep = "")})
#add in the state and time ID columns
#NOTE(review): as with neg_periods_df, cbind() with State coerces the matrix to
#character; values are converted back to numeric at the end of this section
pos_periods_df <- cbind(pos_periods_df, "State" = merged_main_time_data_int$State, 
                        "Time_Period_ID" = merged_main_time_data_int$Time_Period_ID)
#for Hawaii, impute a 0 because it is NA right now
#NOTE(review): 1:40 hard-codes the indicator column count — confirm it equals the
#number of pos_*_pd columns generated above for the current data
pos_periods_df[pos_periods_df[,"State"] == "Hawaii", 1:40] <- 0

#merge the columns of indicators for before and after the intervention with the main analysis data to create the dataset for event study
sensitivity_anlys_event_study_data <- merge(main_analysis_data, 
                                            neg_periods_df, by = c("State", "Time_Period_ID"))

sensitivity_anlys_event_study_data <- merge(sensitivity_anlys_event_study_data, 
                                            pos_periods_df, by = c("State", "Time_Period_ID"))
#change the indicator values to numeric type 
#we first get the index of the new columns
#NOTE(review): "neg_1_pd" and "pos_39_pd" are assumed to be the first and last
#indicator columns, and the indicator columns are assumed contiguous after the
#merges — confirm both assumptions hold for the current data
neg_1_index <- which(colnames(sensitivity_anlys_event_study_data) == "neg_1_pd")
pos_39_index <- which(colnames(sensitivity_anlys_event_study_data) == "pos_39_pd")

#then turn the values to numeric
#(the cbind() calls above coerced the indicator columns to character)
sensitivity_anlys_event_study_data[, neg_1_index:pos_39_index] <- apply(sensitivity_anlys_event_study_data[, neg_1_index:pos_39_index], 
                                                                        2, as.numeric)